In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
 
In [2]:
sheets=pd.read_excel("Regional Sales Dataset.xlsx",sheet_name=None)
In [3]:
#Assign different dataframes to each sheets 
df_sales=sheets['Sales Orders']
df_customer=sheets['Customers']
df_products=sheets['Products']
df_regions=sheets['Regions']
df_state_reg=sheets['State Regions']
df_budgets=sheets['2017 Budgets']
In [4]:
print(f"df sales shape:{df_sales.shape}")
df sales shape:(64104, 12)
In [5]:
df_state_reg.head()
Out[5]:
Column1 Column2 Column3
0 State Code State Region
1 AL Alabama South
2 AR Arkansas South
3 AZ Arizona West
4 CA California West
In [6]:
new_header=df_state_reg.iloc[0]
df_state_reg.columns=new_header
df_state_reg=df_state_reg[1:].reset_index(drop=True)
df_state_reg.head()
Out[6]:
State Code State Region
0 AL Alabama South
1 AR Arkansas South
2 AZ Arizona West
3 CA California West
4 CO Colorado West
In [7]:
df_budgets.columns
Out[7]:
Index(['Product Name', '2017 Budgets'], dtype='object')

Data Cleaning and Wrangling¶

In [9]:
#merging sales and customer tables by using foreign key customer index
df=df_sales.merge(
    df_customer,
    how='left',
    left_on='Customer Name Index',
    right_on='Customer Index'
)
In [10]:
df.head(1)
Out[10]:
OrderNumber OrderDate Customer Name Index Channel Currency Code Warehouse Code Delivery Region Index Product Description Index Order Quantity Unit Price Line Total Total Unit Cost Customer Index Customer Names
0 SO - 000225 2014-01-01 126 Wholesale USD AXW291 364 27 6 2499.1 14994.6 1824.343 126 Rhynoodle Ltd
In [11]:
#merging with products table
df=df.merge(
    df_products,
    how='left',
    left_on='Product Description Index',
    right_on='Index'
)
In [12]:
df.head(1)
Out[12]:
OrderNumber OrderDate Customer Name Index Channel Currency Code Warehouse Code Delivery Region Index Product Description Index Order Quantity Unit Price Line Total Total Unit Cost Customer Index Customer Names Index Product Name
0 SO - 000225 2014-01-01 126 Wholesale USD AXW291 364 27 6 2499.1 14994.6 1824.343 126 Rhynoodle Ltd 27 Product 27
In [13]:
#merge with Regions
df=df.merge(
    df_regions,
    how='left',
    left_on='Delivery Region Index',
    right_on='id'
)
In [14]:
df.head(1)
Out[14]:
OrderNumber OrderDate Customer Name Index Channel Currency Code Warehouse Code Delivery Region Index Product Description Index Order Quantity Unit Price ... type latitude longitude area_code population households median_income land_area water_area time_zone
0 SO - 000225 2014-01-01 126 Wholesale USD AXW291 364 27 6 2499.1 ... City 32.08354 -81.09983 912 145674 52798 36466 268318796 13908113 America/New York

1 rows × 31 columns

In [15]:
#merge with state_region
df=df.merge(
    df_state_reg[["State Code","Region"]],
    how='left',
    left_on='state_code',
    right_on='State Code'
)
In [16]:
df=df.merge(
    df_budgets,
    how='left',
    on='Product Name'
)
In [17]:
df.head(1)
Out[17]:
OrderNumber OrderDate Customer Name Index Channel Currency Code Warehouse Code Delivery Region Index Product Description Index Order Quantity Unit Price ... area_code population households median_income land_area water_area time_zone State Code Region 2017 Budgets
0 SO - 000225 2014-01-01 126 Wholesale USD AXW291 364 27 6 2499.1 ... 912 145674 52798 36466 268318796 13908113 America/New York GA South 964940.231

1 rows × 34 columns

In [18]:
#repetative column cleanup
#customer_index
#index
#id
#State code
cols_to_drop=['Customer Index','Index','id','State Code']
df=df.drop(columns=cols_to_drop)
df.head(1)
Out[18]:
OrderNumber OrderDate Customer Name Index Channel Currency Code Warehouse Code Delivery Region Index Product Description Index Order Quantity Unit Price ... longitude area_code population households median_income land_area water_area time_zone Region 2017 Budgets
0 SO - 000225 2014-01-01 126 Wholesale USD AXW291 364 27 6 2499.1 ... -81.09983 912 145674 52798 36466 268318796 13908113 America/New York South 964940.231

1 rows × 30 columns

In [19]:
#convert all column names for easy accessa
df.columns=df.columns.str.lower()
df.columns.values
Out[19]:
array(['ordernumber', 'orderdate', 'customer name index', 'channel',
       'currency code', 'warehouse code', 'delivery region index',
       'product description index', 'order quantity', 'unit price',
       'line total', 'total unit cost', 'customer names', 'product name',
       'name', 'county', 'state_code', 'state', 'type', 'latitude',
       'longitude', 'area_code', 'population', 'households',
       'median_income', 'land_area', 'water_area', 'time_zone', 'region',
       '2017 budgets'], dtype=object)
In [20]:
#now to keep the imp column and delete those columns which we think is not nessacry 
cols_to_keep=[
    'ordernumber', 
    'orderdate',
    'customer names',
    'channel',
    'product name',
    'order quantity',
    'unit price',
    'line total',
    'total unit cost',
    'state_code',
    'county',
    'state',
    'region',
    'latitude',
    'longitude',
    '2017 budgets'
]
In [21]:
df=df[cols_to_keep]
In [22]:
df.head()
Out[22]:
ordernumber orderdate customer names channel product name order quantity unit price line total total unit cost state_code county state region latitude longitude 2017 budgets
0 SO - 000225 2014-01-01 Rhynoodle Ltd Wholesale Product 27 6 2499.1 14994.6 1824.343 GA Chatham County Georgia South 32.08354 -81.09983 964940.231
1 SO - 0003378 2014-01-01 Thoughtmix Ltd Distributor Product 20 11 2351.7 25868.7 1269.918 IN Johnson County Indiana Midwest 39.61366 -86.10665 2067108.120
2 SO - 0005126 2014-01-01 Amerisourc Corp Wholesale Product 26 6 978.2 5869.2 684.740 CA Alameda County California West 37.66243 -121.87468 5685138.270
3 SO - 0005614 2014-01-01 Colgate-Pa Group Export Product 7 7 2338.3 16368.1 1028.852 IN Monroe County Indiana Midwest 39.16533 -86.52639 889737.555
4 SO - 0005781 2014-01-01 Deseret Group Wholesale Product 8 8 2291.4 18331.2 1260.270 CT Hartford County Connecticut Northeast 41.77524 -72.52443 1085037.329
In [23]:
#rename the columns 
df=df.rename(columns={
    'ordernumber':'order_number', 
    'orderdate':'order_date',
    'customernames':'customer_name', 
    'unit price':'unit_price',
    'line total':'revenue',
    'total_unit_cost':'cost', 
    'latitude':'lat',
    'longitude':'lon',
    '2017 budgets':'budget'
})
In [24]:
#blank out budgets for non 2017 orders bcz of 2017 budgets table 
df.loc[df['order_date'].dt.year != 2017,'budget']=pd.NA
df[['order_date','budget']].head(3)
Out[24]:
order_date budget
0 2014-01-01 NaN
1 2014-01-01 NaN
2 2014-01-01 NaN
In [25]:
## Now filter the dataframe to contain only the records of the year 2017
df_2017=df[df['order_date'].dt.year == 2017]
df_2017.isnull().sum()
Out[25]:
order_number       0
order_date         0
customer names     0
channel            0
product name       0
order quantity     0
unit_price         0
revenue            0
total unit cost    0
state_code         0
county             0
state              0
region             0
lat                0
lon                0
budget             0
dtype: int64
In [26]:
df['total_cost']=df['order quantity']*df['total unit cost']
df['profit']=df['revenue']-df['total_cost']
df['profit margin']=df['profit']/df['revenue']*100 
df.head(3)
# 4. Extract full month name from order_date for labeling (e.g., 'January', 'February')
df['order_month_name'] = df['order_date'].dt.month_name()

# 5. Extract month number from order_date for correct sorting (1–12)
df['order_month_num'] = df['order_date'].dt.month

# 6. (Optional) Display the updated DataFrame
df
Out[26]:
order_number order_date customer names channel product name order quantity unit_price revenue total unit cost state_code ... state region lat lon budget total_cost profit profit margin order_month_name order_month_num
0 SO - 000225 2014-01-01 Rhynoodle Ltd Wholesale Product 27 6 2499.1 14994.6 1824.343 GA ... Georgia South 32.08354 -81.09983 NaN 10946.058 4048.542 27.0 January 1
1 SO - 0003378 2014-01-01 Thoughtmix Ltd Distributor Product 20 11 2351.7 25868.7 1269.918 IN ... Indiana Midwest 39.61366 -86.10665 NaN 13969.098 11899.602 46.0 January 1
2 SO - 0005126 2014-01-01 Amerisourc Corp Wholesale Product 26 6 978.2 5869.2 684.740 CA ... California West 37.66243 -121.87468 NaN 4108.440 1760.760 30.0 January 1
3 SO - 0005614 2014-01-01 Colgate-Pa Group Export Product 7 7 2338.3 16368.1 1028.852 IN ... Indiana Midwest 39.16533 -86.52639 NaN 7201.964 9166.136 56.0 January 1
4 SO - 0005781 2014-01-01 Deseret Group Wholesale Product 8 8 2291.4 18331.2 1260.270 CT ... Connecticut Northeast 41.77524 -72.52443 NaN 10082.160 8249.040 45.0 January 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
64099 SO - 0007573 2018-02-28 Dazzlesphe Corp Wholesale Product 26 12 1815.7 21788.4 980.478 PA ... Pennsylvania Northeast 40.15511 -74.82877 NaN 11765.736 10022.664 46.0 February 2
64100 SO - 0007706 2018-02-28 Yombu Corp Export Product 21 6 864.3 5185.8 579.081 IL ... Illinois Midwest 42.11030 -88.03424 NaN 3474.486 1711.314 33.0 February 2
64101 SO - 0007718 2018-02-28 Bath Group Distributor Product 13 11 3953.0 43483.0 2648.510 FL ... Florida South 26.24453 -80.20644 NaN 29133.610 14349.390 33.0 February 2
64102 SO - 0008084 2018-02-28 Linklinks Ltd Distributor Product 20 7 3959.7 27717.9 2930.178 NY ... New York Northeast 42.91002 -78.74182 NaN 20511.246 7206.654 26.0 February 2
64103 SO - 0008654 2018-02-28 SAFEWAY Ltd Distributor Product 15 8 998.3 7986.4 848.555 OR ... Oregon West 45.48706 -122.80371 NaN 6788.440 1197.960 15.0 February 2

64104 rows × 21 columns

In [27]:
df['order_month'] = df['order_date'].dt.to_period('M')

monthly_sales = df.groupby('order_month')['revenue'].sum()

plt.figure(figsize=(15, 4))

monthly_sales.plot(marker='o', color='navy')

from matplotlib.ticker import FuncFormatter
formatter = FuncFormatter(lambda x, pos: f'{x/1e6:.1f}M')
plt.gca().yaxis.set_major_formatter(formatter)

plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Total Revenue (Millions)')

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [28]:
df_ = df[df['order_date'].dt.year != 2018]   # exclude any partial-year data (i.e., January and February of 2018) so that the monthly totals aren’t skewed by an incomplete year.
                                            #I create df_ so I don’t alter the original df, preserving it intact for other analyses while filtering just for this chart.


# 2. Group by month number and month name, sum revenue, then sort by month number
monthly_sales = (
    df_
    .groupby(['order_month_num', 'order_month_name'])['revenue']
    .sum()
    .sort_index()
)

# 3. Plot setup
from matplotlib.ticker import FuncFormatter

plt.figure(figsize=(13, 4))
plt.plot(
    monthly_sales.index.get_level_values(1),  # X-axis: month names
    monthly_sales.values,                     # Y-axis: total revenue
    marker='o',                                # circle markers
    color='navy'                               # line color
)

# 4. Scale y-axis values to millions for readability
formatter = FuncFormatter(lambda x, pos: f'{x/1e6:.1f}M')
plt.gca().yaxis.set_major_formatter(formatter)

# 5. Add title and axis labels
plt.title('Overall Monthly Sales Trend (Excluding 2018)')
plt.xlabel('Month')
plt.ylabel('Total Revenue (Millions)')

# 6. Rotate x-axis labels for readability
plt.xticks(rotation=45)

# 7. Adjust layout to prevent clipping
plt.tight_layout()

# 8. Display the plot
plt.show()

     
No description has been provided for this image
In [29]:
top_margin = (
    df.groupby('product name')['profit']
      .mean()                        # Calculate mean profit for each product
      .sort_values(ascending=False)  # Sort from highest to lowest average profit
      .head(10)                      # Keep only the top 10 products
)

plt.figure(figsize=(9, 4))

sns.barplot(
    x=top_margin.values,    # X-axis: average profit values
    y=top_margin.index,     # Y-axis: product names
    palette='viridis'       # Color palette for the bars
)

plt.title('Top 10 Products by Avg Profit Margin')  # Chart title
plt.xlabel('Average Profit Margin (USD)')          # X-axis label
plt.ylabel('Product Name')                         # Y-axis label

plt.tight_layout()
plt.show()
C:\Users\prajw\AppData\Local\Temp\ipykernel_2176\601553980.py:10: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image
In [30]:
# Group revenue by sales channel and sort descending
chan_sales = df.groupby('channel')['revenue'].sum().sort_values(ascending=False)

# Set figure size for the pie chart
plt.figure(figsize=(5, 5))

# Plot pie chart with percentage labels and a defined start angle
plt.pie(
    chan_sales.values,                   # Data: revenue values per channel
    labels=chan_sales.index,             # Labels: channel names
    autopct='%1.1f%%',                   # Display percentages with one decimal
    startangle=140,                      # Rotate chart so first slice starts at 140 degrees
    colors=sns.color_palette('coolwarm') # Color palette for slices
)

# Add title for context
plt.title('Total Sales by Channel')

# Adjust layout to ensure everything fits well
plt.tight_layout()

# Display the chart
plt.show()
No description has been provided for this image
In [31]:
# Calculate the total revenue for each order to get the order value
aov = df.groupby('order_number')['revenue'].sum()

# Set the figure size for better visibility
plt.figure(figsize=(12, 4))

# Plot a histogram of order values
plt.hist(
    aov,               # Data: list of order values
    bins=50,           # Number of bins to group order values
    color='skyblue',   # Fill color of the bars
    edgecolor='black'  # Outline color of the bars
)

# Add title and axis labels for context
plt.title('Distribution of Average Order Value')
plt.xlabel('Order Value (USD)')
plt.ylabel('Number of Orders')

# Adjust layout to prevent clipping
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [32]:
# Calculate profit margin percentage for each record
df['profit_margin_pct'] = (df['profit'] / df['revenue']) * 100

# Set the figure size for clarity
plt.figure(figsize=(6,4))

# Plot unit price vs. profit margin percentage
plt.scatter(
    df['unit_price'],            # X-axis: unit price in USD
    df['profit_margin_pct'],     # Y-axis: profit margin percentage
    alpha=0.6,                   # Transparency level for overlapping points
    color='green'                # Point color
)

# Add title and axis labels
plt.title('Profit Margin % vs. Unit Price')  # Chart title
plt.xlabel('Unit Price (USD)')                # X-axis label
plt.ylabel('Profit Margin (%)')               # Y-axis label

# Adjust layout to prevent clipping
plt.tight_layout()

# Display the plot
plt.show()
No description has been provided for this image
In [33]:
# Set figure size for clarity
plt.figure(figsize=(12,4))

# Create a boxplot of unit_price by product_name
sns.boxplot(
    data=df,
    x='product name',   # X-axis: product categories
    y='unit_price',      # Y-axis: unit price values
    color='g'            # Box color
)

# Add title and axis labels
plt.title('Unit Price Distribution per Product')  # Chart title
plt.xlabel('Product')                              # X-axis label
plt.ylabel('Unit Price (USD)')                     # Y-axis label

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Adjust layout to prevent clipping of labels
plt.tight_layout()

# Display the plot
plt.show()
No description has been provided for this image
In [34]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Aggregate total sales by region (in millions)
region_sales = (
    df
    .groupby('region')['revenue']
    .sum()
    .sort_values(ascending=False)  # so bars go top→bottom
    / 1e6                         # convert to millions
)

# 2. Plot
plt.figure(figsize=(10, 4))
sns.barplot(
    x=region_sales.values,
    y=region_sales.index,
    palette='Greens_r'          # dark→light green
)

# 3. Formatting
plt.title('Total Sales by US Region', fontsize=16, pad=12)
plt.xlabel('Total Sales (in Millions USD)', fontsize=12)
plt.ylabel('US Region', fontsize=12)
plt.xticks(rotation=0)
sns.despine(left=True, bottom=True)

plt.tight_layout() 
plt.show()
C:\Users\prajw\AppData\Local\Temp\ipykernel_2176\1217761970.py:15: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image
In [35]:
import plotly.express as px

# 1. Aggregate revenue by state (in millions)
state_sales = (
    df
    .groupby('state')['revenue']
    .sum()
    .reset_index()
)
state_sales['revenue_m'] = state_sales['revenue'] / 1e6  # convert to millions

# 2. Plotly choropleth
fig = px.choropleth(
    state_sales,
    locations='state',            # column with state codes
    locationmode='USA-states',    # tells Plotly these are US states
    color='revenue_m',
    scope='usa',
    labels={'revenue_m':'Total Sales (M USD)'},
    color_continuous_scale='Blues',
    hover_data={'revenue_m':':.2f'}  # show 2 decimals
)

# 3. Layout tuning
fig.update_layout(
    title_text='Total Sales by State',
    margin=dict(l=0, r=0, t=40, b=0),
    coloraxis_colorbar=dict(
        title='Sales (M USD)',
        ticksuffix='M'
    )
)

fig.show()
In [ ]:
df.to_csv('Sales_data(EDA Exported).csv', index=False)
In [ ]:
df= df[df['order_date'].dt.year != 2018]
In [ ]:
df.to_csv('Sales_data(EDA Exported).csv', index=False)
In [ ]: